import json
import time

# Scan a dump with one JSON object per line and keep every record whose
# 'subreddit' field is exactly 'autism'. The matching records end up in
# `results`; the elapsed wall-clock time is printed at the end.
start_time = time.time()

results = []
i = 0  # running count of matching records (equals len(results))
with open('/Volumes/TobiBackupSmall/insight/reddit_all', 'r') as f:
    # Iterating the file object reads it line by line without loading the
    # whole dump into memory.
    for line in f:
        # Cheap pre-filter: only pay for JSON parsing when the word 'autism'
        # occurs in the first 1000 characters of the line.
        if 'autism' in line[:1000]:
            obj = json.loads(line)
            # .get() so that a record lacking a 'subreddit' key is skipped
            # instead of aborting the scan with a KeyError.
            if obj.get('subreddit') == 'autism':
                i += 1
                results.append(obj)

print("--- %s seconds ---" % (time.time() - start_time))

In [1]:
import os
import pickle

import pandas as pd

# Relative paths used from here on (the pickle below, the CSV exports later)
# resolve against the data directory.
os.chdir('../data/')

# The pickle was originally written with:
#pickle.dump( results , open( "reditt_autism.p", "wb" ) )
#
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load pickles that you produced yourself.
# The context manager closes the file handle as soon as loading finishes.
with open("reditt_autism.p", "rb") as fh:
    results = pickle.load(fh)

In [2]:
# Number of records held in `results`.
len(results)


Out[2]:
4940
# Look at the first record in `results`.
results[0]

In [3]:
# Empty posts table; add_post() appends one row per call.
columns = ['post id', 'title', 'text', 'href', 'user id']
df = pd.DataFrame(columns=columns)

# Empty users table. Its index is labelled "user id"; the single column
# holds the value that add_post() receives as `user_name`.
columns = ['user description']
df_users = pd.DataFrame(columns=columns).rename_axis("user id")

In [4]:
def add_post(post_id, title, text, url, user_name):
    """Append one post to ``df``, registering its author in ``df_users`` if new.

    Args:
        post_id: Value stored in the 'post id' column.
        title: Value stored in the 'title' column.
        text: Value stored in the 'text' column.
        url: Value stored in the 'href' column.
        user_name: Author name. It is looked up in the 'user description'
            column of ``df_users``; if absent, a new row is appended there.

    Returns:
        None. Both module-level frames ``df`` and ``df_users`` are modified
        in place.
    """
    global df, df_users

    # A single pass over the user table answers both "is this author known?"
    # and "what is their id?".
    is_author = (df_users['user description'] == user_name).values
    if is_author.any():
        # First matching index label, as in a first-match lookup.
        user_id = df_users.index.values[is_author][0]
    else:
        # New author: the next free label is the current number of users.
        user_id = len(df_users)
        df_users.loc[user_id] = {"user description": user_name}

    # len(df) counts rows directly; going through df.values would build a
    # full array copy of the frame on every call.
    df.loc[len(df)] = {"post id": post_id,
                       "title": title,
                       "text": text,
                       "href": url,
                       "user id": user_id}

In [5]:
# Walk over every loaded record and store the ones that have a real body.
# Records whose 'selftext' is 3 characters or shorter are not stored; their
# stripped text is collected so it can be inspected afterwards.
post_id = 0
skipped_texts = []
for result in results:
    url = result['url']
    title = result['title']
    text = result['selftext']
    user_name = result['author']
    if len(text) > 3:
        # Post ids are consecutive over the stored posts only, starting at 1.
        post_id += 1
        add_post(post_id, title, text, url, user_name)
    else:
        skipped_texts.append(text.strip())

# Join once at the end instead of growing a string inside the loop.
not_shown = "".join(skipped_texts)
not_shown


Out[5]:
'...` `'

In [6]:
# Preview the first two rows of the posts table.
df.head(2)


Out[6]:
post id title text href user id
0 1 Sulfur and sulfates question. I have been subbing in a special education cla... http://www.reddit.com/r/autism/comments/1xuenq... 0
1 2 Feel like son is missing out I want to take him to ball games movie ect but... http://www.reddit.com/r/autism/comments/1xtt96... 1

In [7]:
# Preview the first two rows of the users table.
df_users.head(2)


Out[7]:
user description
user id
0 baylorhawkeye
1 Wellpaidrichguy

In [8]:
# Report how many posts are in the table, then write both tables to CSV.
print(len(df.index))

# Posts: the row index is left out of the file.
df.to_csv('reditt-posts.csv', index=False)

# Users: the index (named "user id") is written as the first column;
# index=True is the to_csv default, spelled out here for contrast.
df_users.to_csv('reditt-users.csv', index=True)


1527